Photo by Prasad Panchakshari on Unsplash
We've combined data from multiple sources:

#load common utilities
import os
from datetime import datetime
import glob
import json
#load data analysis packages
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sb
sb.set()
import squarify
from pywaffle import Waffle
import folium #needed for interactive map
from folium.plugins import HeatMap
import geopy
from geopy.distance import geodesic
#set display options
from pylab import rcParams
rcParams['figure.figsize'] = 15, 10
pd.set_option('display.max_rows', 10)
pd.set_option('display.max_columns', None)
from IPython.display import HTML, IFrame
#set working directory
rootdir = os.getcwd()
data_path = "../data"
import sys
sys.path.append('../src/scripts/')
import sg
#[latitude, longitude] used to center the folium maps on Philadelphia
philly_center = [39.9526, -75.1652]
SafeGraph is an industry-leading data company that brings together highly accurate data from thousands of sources to deliver deep insights on millions of locations where consumers spend their money. This data is updated monthly to accurately account for store openings and closings. SafeGraph offers three categories of data on their POI locations:
"Our customers trust SafeGraph data to be clean and free of unnecessary noise. Our algorithms work over-time to remove and filter out irrelevant POIs and errors in the data. Better data means better decisions for your business." - SafeGraph
Below we will be taking a look at a monthly data extract containing POI data for Wawa locations in and around the Philadelphia area. In addition to the main dataset, we will be joining it to an additional file that SafeGraph provides that details the sample size of devices measured in each census block group (we'll explore this further a little later on). SafeGraph states that they get a ~10% sample size of device data from their providers, and does their best to make sure they have a representative demographic sample. You can read more about how they tackle bias in their data collection in this blog post.
We will also be joining this data with SafeGraph's Open Census Data, a standardized and processed version of the U.S. Census Bureau's American Community Survey, to get the estimated population of each census block group.
#define datatypes: keep code-like columns as strings to preserve leading zeros
dtypes = {
    "naics_code" : "object",
    "postal_code": "object",
    "phone_number": "object",
    "poi_cbg": "object"
}
#read each source file first, then join step by step
wawa_poi = pd.read_csv(f"{data_path}/raw/safegraph/wawa/oct/oct.csv", dtype=dtypes)
panel = pd.read_csv(f'{data_path}/raw/safegraph/wawa/oct/home_panel_summary.csv', usecols=['census_block_group', 'number_devices_residing'], dtype={"census_block_group": "object"})
population = pd.read_csv(f"{data_path}/raw/census/data/cbg_b01.csv", usecols=['census_block_group','B01001e1'], dtype={'census_block_group': 'object'})
#attach the device-panel sample size for each POI's census block group
safegraph_wawa_october = wawa_poi.merge(panel, left_on='poi_cbg', right_on='census_block_group', how='left')
#attach the census population estimate (column B01001e1) for the same CBG
safegraph_wawa_october = safegraph_wawa_october.merge(population, on='census_block_group', how='left').rename(columns={"B01001e1":'cbg_pop'})
#read in additional census data for later
cbgs = pd.read_csv(f"{data_path}/raw/census/metadata/cbg_geographic_data.csv", usecols=['census_block_group', 'longitude', 'latitude'], dtype={"census_block_group": "object"})
safegraph_wawa_october.head()
| placekey | safegraph_place_id | parent_placekey | parent_safegraph_place_id | location_name | safegraph_brand_ids | brands | top_category | sub_category | category_tags | naics_code | latitude | longitude | street_address | city | region | postal_code | iso_country_code | phone_number | open_hours | opened_on | closed_on | tracking_opened_since | tracking_closed_since | date_range_start | date_range_end | raw_visit_counts | raw_visitor_counts | visits_by_day | visitor_home_cbgs | visitor_country_of_origin | distance_from_home | median_dwell | bucketed_dwell_times | related_same_day_brand | related_same_month_brand | popularity_by_hour | popularity_by_day | device_type | poi_cbg | visitor_daytime_cbgs | census_block_group | number_devices_residing | cbg_pop | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | zzw-222@628-p8n-cnq | sg:57221f3709624c3b8fd6df6e07e8ec1b | NaN | NaN | Wawa | SG_BRAND_fa5bda016924cdc09a52f985ffb8b6e9 | Wawa | Grocery Stores | Convenience Stores | NaN | 445120 | 40.036273 | -75.070238 | 6460 Bustleton Ave | Philadelphia | PA | 19149 | US | +12155332110 | NaN | NaN | NaN | 2019-12 | 2019-07 | 2020-10-01T00:00:00-04:00 | 2020-11-01T00:00:00-04:00 | 733.0 | 515.0 | [29,28,21,20,26,25,21,29,23,33,22,23,32,19,19,... | {"421010312002":12,"421010318001":12,"42101031... | {"US":486} | 4134.0 | 9.0 | {"<5":44,"5-10":393,"11-20":161,"21-60":85,"61... | {"LUKOIL North America":50,"Philadelphia Sport... | {"Dunkin'":46,"Sunoco":36,"Dollar Tree":27,"Mc... | [21,21,26,20,22,31,61,73,57,59,63,48,61,70,62,... | {"Monday":106,"Tuesday":100,"Wednesday":84,"Th... | {"android":353,"ios":161} | 421010313003 | {"421010312002":9,"421010313004":9,"4210103130... | 421010313003 | 287.0 | 1074.0 |
| 1 | 222-222@63s-dw8-bx5 | sg:14ce4ee122e14a7a8f41b969af9e46e9 | NaN | NaN | Wawa | SG_BRAND_fa5bda016924cdc09a52f985ffb8b6e9 | Wawa | Grocery Stores | Convenience Stores | NaN | 445120 | 39.891192 | -75.241896 | 8220 Bartram Ave | Philadelphia | PA | 19153 | US | +12154920824 | NaN | NaN | NaN | 2019-12 | 2019-07 | 2020-10-01T00:00:00-04:00 | 2020-11-01T00:00:00-04:00 | 2215.0 | 1459.0 | [104,76,71,67,68,53,61,89,70,64,83,64,59,65,93... | {"421010055003":13,"421010063004":12,"42101003... | {"US":1358,"BS":4} | 17269.0 | 10.0 | {"<5":105,"5-10":1161,"11-20":462,"21-60":290,... | {} | {"Dunkin'":36,"Sunoco":32,"McDonald's":28,"Wal... | [93,80,69,91,120,114,172,177,219,182,194,188,1... | {"Monday":273,"Tuesday":249,"Wednesday":247,"T... | {"android":887,"ios":570} | 421019809001 | {"421019809001":28,"420454028003":12,"42101005... | 421019809001 | 219.0 | 4.0 |
| 2 | 224-222@628-qx3-grk | sg:f6b2db814d5f440a98af73e502b160e4 | NaN | NaN | Wawa | SG_BRAND_fa5bda016924cdc09a52f985ffb8b6e9 | Wawa | Grocery Stores | Convenience Stores | NaN | 445120 | 39.802768 | -74.950869 | 201 W White Horse Pike | Berlin | NJ | 08009 | US | +18567685117 | NaN | NaN | NaN | 2019-12 | 2019-07 | 2020-10-01T00:00:00-04:00 | 2020-11-01T00:00:00-04:00 | 1298.0 | 765.0 | [32,44,56,44,33,38,48,53,70,49,47,36,36,31,32,... | {"340076087004":69,"340076087003":38,"34007608... | {"US":755} | 6669.0 | 7.0 | {"<5":130,"5-10":873,"11-20":161,"21-60":78,"6... | {"Applebee's":6} | {"Dunkin'":45,"Walmart":39,"McDonald's":37,"Ch... | [11,13,13,11,15,24,77,79,100,94,69,91,117,112,... | {"Monday":131,"Tuesday":150,"Wednesday":157,"T... | {"android":389,"ios":380} | 340076087003 | {"340076087004":65,"340076087003":39,"34007608... | 340076087003 | 105.0 | 1251.0 |
| 3 | zzy-222@63s-dvs-yqf | sg:7f5feacb04de468eb7d92880813b5389 | NaN | NaN | Wawa | SG_BRAND_fa5bda016924cdc09a52f985ffb8b6e9 | Wawa | Grocery Stores | Convenience Stores | NaN | 445120 | 39.941005 | -75.272337 | 67 N Lansdowne Ave | Lansdowne | PA | 19050 | US | +16106235430 | NaN | NaN | NaN | 2019-12 | 2019-07 | 2020-10-01T00:00:00-04:00 | 2020-11-01T00:00:00-04:00 | 915.0 | 515.0 | [25,38,33,19,34,24,30,33,38,40,21,27,20,28,26,... | {"420454018003":16,"420454018002":11,"42045401... | {"US":487} | 3002.0 | 9.0 | {"<5":49,"5-10":544,"11-20":154,"21-60":81,"61... | {"CVS":8,"Trinity Health":6,"Popeyes Louisiana... | {"Dunkin'":37,"Sunoco":34,"McDonald's":29,"CVS... | [24,23,20,23,15,26,69,102,139,113,104,105,122,... | {"Monday":109,"Tuesday":108,"Wednesday":130,"T... | {"android":380,"ios":136} | 420454018003 | {"420454018003":16,"420454020001":11,"42101008... | 420454018003 | 66.0 | 1280.0 |
| 4 | 229-222@628-p8d-z2k | sg:bc8b48dfd5734efeaf9d509068552e5f | NaN | NaN | Wawa | SG_BRAND_fa5bda016924cdc09a52f985ffb8b6e9 | Wawa | Grocery Stores | Convenience Stores | NaN | 445120 | 40.012014 | -75.068135 | 5597 Tulip St | Philadelphia | PA | 19124 | US | +12677687176 | NaN | NaN | NaN | 2019-12 | 2019-07 | 2020-10-01T00:00:00-04:00 | 2020-11-01T00:00:00-04:00 | 1365.0 | 818.0 | [56,61,51,39,26,44,47,42,48,47,38,55,59,54,47,... | {"421010299004":18,"421010321001":13,"42101018... | {"US":769} | 3622.0 | 10.0 | {"<5":62,"5-10":732,"11-20":284,"21-60":182,"6... | {"CVS":8} | {"Dunkin'":46,"Sunoco":30,"McDonald's":29,"Dol... | [8,1,7,10,43,51,89,104,121,109,125,118,129,115... | {"Monday":180,"Tuesday":200,"Wednesday":183,"T... | {"android":507,"ios":309} | 421010298004 | {"421010299004":15,"421010381001":14,"42101029... | 421010298004 | 50.0 | 1304.0 |
#get_sg_data() repeats the load-and-join process above for any POI and month
def get_sg_data(poi_name, month):
    """Load a SafeGraph monthly POI extract joined with panel and census data.

    Parameters
    ----------
    poi_name : str
        Folder name of the POI brand under data/raw/safegraph (e.g. "wawa").
    month : str
        Month folder/file name (e.g. "oct").

    Returns
    -------
    pandas.DataFrame
        The POI records with `number_devices_residing` (device-panel sample
        size) and `cbg_pop` (census population estimate) merged in by
        census block group.
    """
    #string dtypes preserve leading zeros in code-like numeric columns
    dtypes = {
        "naics_code" : "object",
        "postal_code": "object",
        "phone_number": "object",
        "poi_cbg": "object"
    }
    #read in SafeGraph data, joined with additional datasets
    safegraph_df = pd.read_csv(f"{data_path}/raw/safegraph/{poi_name}/{month}/{month}.csv", dtype=dtypes).merge(
        pd.read_csv(f'{data_path}/raw/safegraph/{poi_name}/{month}/home_panel_summary.csv', usecols=['census_block_group', 'number_devices_residing'], dtype={"census_block_group": "object"}),
        left_on='poi_cbg', right_on='census_block_group', how='left'
    ).merge(
        pd.read_csv(f"{data_path}/raw/census/data/cbg_b01.csv", usecols=['census_block_group','B01001e1'], dtype={'census_block_group': 'object'}),
        on='census_block_group', how='left'
    ).rename(columns={"B01001e1":'cbg_pop'})
    return safegraph_df
As you can see, there is a wealth of information about each of our store locations. When looking at the Patterns portion of the dataset, it becomes apparent that these columns are not easily able to be processed right away due to the different data embeddings encoded as strings. However, with a little elbow grease and Python data manipulation magic, we can uncover some pretty fascinating insights about our POI locations.
sg module¶In order to assist with all of the data manipulation steps needed to dissect the foot traffic pattern columns and massage them into a workable format, we created and packaged several functions under the sg module to more efficiently process each store location. The source code of this module can be found at /src/scripts/sg.py.
#select the first store record to explore its columns
#fix: `safegraph` was undefined; the frame loaded above is `safegraph_wawa_october`
store = safegraph_wawa_october.iloc[0]
store
placekey zzw-222@628-p8n-cnq
safegraph_place_id sg:57221f3709624c3b8fd6df6e07e8ec1b
parent_placekey NaN
parent_safegraph_place_id NaN
location_name Wawa
...
poi_cbg 421010313003
visitor_daytime_cbgs {"421010312002":9,"421010313004":9,"4210103130...
census_block_group 421010313003
number_devices_residing 287.0
cbg_pop 1074.0
Name: 0, Length: 44, dtype: object
visits_by_day column using the sg.calc_visits_by_day() function¶The visits_by_day column of the SafeGraph dataset contains a string-encoded list of the number of visits to the POI location each day of the month. This list doesn't become useful until we map the values to the actual days and dates they correspond to, so that's what the sg.calc_visits_by_day() function is designed to do. In a nutshell, it uses the date_range_start and date_range_end columns from the POI record to create a pandas date_range index to create a row for every day in the date range. Then, it takes the list of daily visits and maps the first value to day 1, second value to day 2, and so on. Calling the function, which takes a pandas Series object representing a single store observation in the dataset, returns the below dataframe:
#output the raw visits_by_day column (string-encoded list of daily visit counts)
print(store.visits_by_day)
#use sg.calc_visits_by_day to generate daily visit dataframe
#(maps each list value onto its calendar date via date_range_start/end)
visits = sg.calc_visits_by_day(store)
visits
[29,28,21,20,26,25,21,29,23,33,22,23,32,19,19,22,36,26,21,22,33,24,33,35,30,36,21,11,2,4,7]
| id | visits | |
|---|---|---|
| 2020-10-01 | sg:57221f3709624c3b8fd6df6e07e8ec1b | 29 |
| 2020-10-02 | sg:57221f3709624c3b8fd6df6e07e8ec1b | 28 |
| 2020-10-03 | sg:57221f3709624c3b8fd6df6e07e8ec1b | 21 |
| 2020-10-04 | sg:57221f3709624c3b8fd6df6e07e8ec1b | 20 |
| 2020-10-05 | sg:57221f3709624c3b8fd6df6e07e8ec1b | 26 |
| ... | ... | ... |
| 2020-10-27 | sg:57221f3709624c3b8fd6df6e07e8ec1b | 21 |
| 2020-10-28 | sg:57221f3709624c3b8fd6df6e07e8ec1b | 11 |
| 2020-10-29 | sg:57221f3709624c3b8fd6df6e07e8ec1b | 2 |
| 2020-10-30 | sg:57221f3709624c3b8fd6df6e07e8ec1b | 4 |
| 2020-10-31 | sg:57221f3709624c3b8fd6df6e07e8ec1b | 7 |
31 rows × 2 columns
#plot the daily visit counts from the dataframe returned by sg.calc_visits_by_day()
visits.visits.plot().set_title(f"Visits by day to {store.street_address}")
Text(0.5, 1.0, 'Visits by day to 6460 Bustleton Ave')
As discussed earlier, SafeGraph receives a ~10% sample of the available device data, so the foot traffic numbers reported in the dataset are a lot lower than the real life visits. In order to correct for this sampling, when required, we must use the visitor_home_cbgs column that disaggregates the visitors by their home census block group. We then use our supplementary datasets to extrapolate and estimate the real number of visits based on SafeGraph's reported sample size for each of the census block groups and the actual reported 2016 population of each census block group. We created the sg.calc_true_visits_by_day() function to perform these calculations based on this example.
Below is the returned dataframe from calling sg.calc_true_visits_by_day() on the same store Series:
#extrapolate daily visits to estimated real-world counts using the
#device-panel sample size and census population merged in earlier
true_visits = sg.calc_true_visits_by_day(store)
true_visits
| id | visits | true_visits | |
|---|---|---|---|
| 2020-10-01 | sg:57221f3709624c3b8fd6df6e07e8ec1b | 29 | 125.297148 |
| 2020-10-02 | sg:57221f3709624c3b8fd6df6e07e8ec1b | 28 | 120.976557 |
| 2020-10-03 | sg:57221f3709624c3b8fd6df6e07e8ec1b | 21 | 90.732418 |
| 2020-10-04 | sg:57221f3709624c3b8fd6df6e07e8ec1b | 20 | 86.411826 |
| 2020-10-05 | sg:57221f3709624c3b8fd6df6e07e8ec1b | 26 | 112.335374 |
| ... | ... | ... | ... |
| 2020-10-27 | sg:57221f3709624c3b8fd6df6e07e8ec1b | 21 | 90.732418 |
| 2020-10-28 | sg:57221f3709624c3b8fd6df6e07e8ec1b | 11 | 47.526505 |
| 2020-10-29 | sg:57221f3709624c3b8fd6df6e07e8ec1b | 2 | 8.641183 |
| 2020-10-30 | sg:57221f3709624c3b8fd6df6e07e8ec1b | 4 | 17.282365 |
| 2020-10-31 | sg:57221f3709624c3b8fd6df6e07e8ec1b | 7 | 30.244139 |
31 rows × 3 columns
# --- popularity_by_hour column ---
#print original popularity_by_hour column
print(store.popularity_by_hour)
print()
#parse the JSON-encoded list safely (avoid eval on data loaded from disk)
#and map each value to its hour of day. SafeGraph reports 24 values starting
#at midnight, so hours run 0-23 (the original 1-24 labeling was off by one).
d = dict(zip(range(24), json.loads(store.popularity_by_hour)))
print(d)
pd.DataFrame(data={"hour":d.keys(), "popularity":d.values()}).plot.bar('hour','popularity').set_title(f"Popularity by hour at {store.street_address}")
[21,21,26,20,22,31,61,73,57,59,63,48,61,70,62,54,68,65,69,55,43,41,35,29]
{1: 21, 2: 21, 3: 26, 4: 20, 5: 22, 6: 31, 7: 61, 8: 73, 9: 57, 10: 59, 11: 63, 12: 48, 13: 61, 14: 70, 15: 62, 16: 54, 17: 68, 18: 65, 19: 69, 20: 55, 21: 43, 22: 41, 23: 35, 24: 29}
Text(0.5, 1.0, 'Popularity by hour at 6460 Bustleton Ave')
# --- device_type column ---
print(store.device_type)
#parse the JSON-encoded dict safely instead of eval'ing data read from disk
device_type = json.loads(store.device_type)
fig1, ax1 = plt.subplots()
ax1.pie(device_type.values(), labels=device_type.keys(), autopct='%1.1f%%',
        shadow=False, startangle=90)
ax1.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.
#white circle over the center turns the pie into a donut chart
my_circle=plt.Circle( (0,0), 0.7, color='white')
p=plt.gcf()
p.gca().add_artist(my_circle)
plt.title(store.street_address)
plt.show()
{"android":353,"ios":161}
# --- bucketed_dwell_times column ---
print(store.bucketed_dwell_times)
#parse the JSON-encoded dict safely (avoid eval on data read from disk)
d = json.loads(store.bucketed_dwell_times)
pd.DataFrame(data={'timeframe(minutes)':d.keys(), 'visits':d.values()}).plot.barh('timeframe(minutes)', 'visits').set_title(store.street_address)
{"<5":44,"5-10":393,"11-20":161,"21-60":85,"61-120":19,"121-240":11,">240":20}
Text(0.5, 1.0, '6460 Bustleton Ave')
# --- related_same_day_brand column ---
print(store.related_same_day_brand)
#parse the JSON-encoded dict safely (avoid eval on data read from disk)
d = json.loads(store.related_same_day_brand)
#treemap of other brands visited on the same day, sized by visit count
squarify.plot(sizes=d.values(), label=d.keys(), alpha=.7)
plt.axis('off')
plt.show()
{"LUKOIL North America":50,"Philadelphia Sports Club":10,"Dunkin'":10,"Rite Aid":10,"The Home Depot":10,"McDonald's":10,"Family Dollar Stores":9,"Dollar Tree":8}
# --- related_same_month_brand column ---
#parse the JSON-encoded dict safely (avoid eval on data read from disk)
d = json.loads(store.related_same_month_brand)
#treemap of other brands visited in the same month, sized by visit count
squarify.plot(sizes=d.values(), label=d.keys(), alpha=.7)
plt.axis('off')
plt.show()
By plotting this visualization, we can get a view of how far people are traveling to this store from their home location. It makes sense that there are a lot of visitors (black markers) that live near the store (red marker), and it can be safely assumed that the visitors who live farther away visit that location either on their way to work or on the way to a different part of town.
Given that we only have data from the past few months, it makes sense that there are very few black markers very far away from the store given reduced travel due to COVID-19. If we can get access to any pre-COVID data, it will be interesting to see how far the black markers get spread throughout the country as a possible micro-view into the overall tourism trends of Philadelphia.
#visitor home CBG ids are the keys of the JSON-encoded dict;
#parse with json.loads instead of eval on data read from disk
visitor_cbgs = list(json.loads(store.visitor_home_cbgs))
location = cbgs[cbgs['census_block_group'] == store.poi_cbg].iloc[0]
folium_map = folium.Map(location=[location.latitude, location.longitude],
                        zoom_start=13,
                        tiles="OpenStreetMap")
for visitor in visitor_cbgs:
    matches = cbgs[cbgs['census_block_group'] == visitor]
    if matches.empty:
        #skip CBGs missing from the geographic lookup instead of crashing
        continue
    m = matches.iloc[0]
    #black circle per visitor home CBG; popup shows distance to the store
    folium.CircleMarker(location=[m.latitude, m.longitude], radius=5, color='black', fill=False)\
        .add_child(folium.Popup(f"Distance to store: {round(geodesic([location.latitude, location.longitude], [m.latitude, m.longitude]).miles, 2)} miles"))\
        .add_to(folium_map)
#red marker for the store's own location
folium.CircleMarker(location=[location.latitude, location.longitude], radius=10, color='red', fill=True).add_to(folium_map)
folium_map
#map all Wawa (red) and Starbucks (green) locations around central Philadelphia
folium_map = folium.Map(location=philly_center,
                        zoom_start=12,
                        tiles="OpenStreetMap")
safegraph_starbucks_october = get_sg_data("starbucks", "oct")
#one parameterized loop instead of two copies: (dataframe, marker color) pairs
for brand_df, marker_color in ((safegraph_wawa_october, 'red'), (safegraph_starbucks_october, 'green')):
    for _, row in brand_df.iterrows():
        folium.CircleMarker(location=[row.latitude, row.longitude], radius=4, color=marker_color, fill=True).add_to(folium_map)
folium_map
#same brand-comparison map, but scale each marker's radius by the store's
#raw visit counts so busier locations draw larger circles
folium_map = folium.Map(location=philly_center,
                        zoom_start=12,
                        tiles="OpenStreetMap")
for i, store_location in safegraph_wawa_october.iterrows():
    folium.CircleMarker(location=[store_location.latitude, store_location.longitude], radius=(store_location.raw_visit_counts * 0.02), color='red', fill=True).add_to(folium_map)
#reuse safegraph_starbucks_october loaded above instead of re-reading the CSVs
for i, store_location in safegraph_starbucks_october.iterrows():
    folium.CircleMarker(location=[store_location.latitude, store_location.longitude], radius=(store_location.raw_visit_counts * 0.02), color='green', fill=True).add_to(folium_map)
folium_map
#total monthly visits per brand, in thousands, for the waffle chart
s_visits = safegraph_starbucks_october['raw_visit_counts'].sum()/1000
w_visits = safegraph_wawa_october['raw_visit_counts'].sum()/1000
#NOTE(review): safegraph_7_eleven_oct is not defined in this section --
#presumably loaded earlier via get_sg_data("7_eleven", "oct"); confirm
se_visits = safegraph_7_eleven_oct['raw_visit_counts'].sum()/1000
data = {'7-Eleven': se_visits, 'Wawa': w_visits}
fig = plt.figure(
    FigureClass=Waffle,
    rows=10,
    values=data,
    colors=("#00643C", "#A8202F"),
    title={'label': '7-Eleven vs. Wawa: Foot Traffic Performance', 'loc': 'left'},
    #fix: values are visits in thousands, not percentages -- the original
    #"%" suffix in the legend labels was misleading
    labels=["{0} ({1:.1f}k visits)".format(k, v) for k, v in data.items()],
    legend={'loc': 'lower left', 'bbox_to_anchor': (0, -0.4), 'ncol': len(data), 'framealpha': 0},
)
fig.gca().set_facecolor('#EEEEEE')
fig.set_facecolor('#EEEEEE')
plt.show()
#load daily weather observations, normalize column names, index by date,
#and keep only the single station of interest
weather = pd.read_csv(f"{data_path}/raw/weather/weather.csv")
weather.columns = weather.columns.str.lower()
weather['date'] = pd.to_datetime(weather['date'])
weather = weather.set_index('date')
station = "USC00286964"
weather = weather[weather.station == station]
weather.head()
| station | name | latitude | longitude | elevation | awnd | dapr | mdpr | pgtm | prcp | snow | snwd | tavg | tmax | tmin | tobs | wdf2 | wdf5 | wesd | wesf | wsf2 | wsf5 | wt01 | wt02 | wt03 | wt04 | wt05 | wt06 | wt08 | wt11 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| date | ||||||||||||||||||||||||||||||
| 2020-01-01 | USC00286964 | PHILADELPHIA MT. HOLLY WEATHER FORECAST OFFICE... | 40.01369 | -74.81753 | 19.2 | NaN | NaN | NaN | NaN | 0.00 | 0.0 | 0.0 | NaN | 46.0 | 37.0 | 37.0 | NaN | NaN | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 2020-01-02 | USC00286964 | PHILADELPHIA MT. HOLLY WEATHER FORECAST OFFICE... | 40.01369 | -74.81753 | 19.2 | NaN | NaN | NaN | NaN | 0.00 | 0.0 | 0.0 | NaN | 41.0 | 21.0 | 22.0 | NaN | NaN | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 2020-01-03 | USC00286964 | PHILADELPHIA MT. HOLLY WEATHER FORECAST OFFICE... | 40.01369 | -74.81753 | 19.2 | NaN | NaN | NaN | NaN | 0.12 | 0.0 | 0.0 | NaN | 50.0 | 21.0 | 43.0 | NaN | NaN | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 2020-01-04 | USC00286964 | PHILADELPHIA MT. HOLLY WEATHER FORECAST OFFICE... | 40.01369 | -74.81753 | 19.2 | NaN | NaN | NaN | NaN | 0.26 | 0.0 | 0.0 | NaN | 48.0 | 43.0 | 48.0 | NaN | NaN | NaN | NaN | NaN | NaN | 1.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 2020-01-05 | USC00286964 | PHILADELPHIA MT. HOLLY WEATHER FORECAST OFFICE... | 40.01369 | -74.81753 | 19.2 | NaN | NaN | NaN | NaN | 0.07 | 0.0 | 0.0 | NaN | 53.0 | 38.0 | 38.0 | NaN | NaN | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
#load Google mobility data at each geographic level, joining the city/county/
#state files to their geo-id lookup tables (national needs no lookup)
city_mobility = pd.read_csv(f"{data_path}/raw/mobility/Google Mobility - City - Daily.csv").merge(pd.read_csv(f"{data_path}/raw/geo/GeoIDs - City.csv"), on='cityid')
county_mobility = pd.read_csv(f"{data_path}/raw/mobility/Google Mobility - County - Daily.csv").merge(pd.read_csv(f"{data_path}/raw/geo/GeoIDs - County.csv"), on='countyfips')
state_mobility = pd.read_csv(f"{data_path}/raw/mobility/Google Mobility - State - Daily.csv").merge(pd.read_csv(f"{data_path}/raw/geo/GeoIDs - State.csv"), on='statefips')
national_mobility = pd.read_csv(f"{data_path}/raw/mobility/Google Mobility - National - Daily.csv")
city_mobility.head()
| year | month | day | cityid | gps_retail_and_recreation | gps_grocery_and_pharmacy | gps_parks | gps_transit_stations | gps_workplaces | gps_residential | gps_away_from_home | cityname | stateabbrev | statename | statefips | city_pop2019 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2020 | 2 | 24 | 1 | 0.00571 | -0.00286 | .0714 | .00286 | 0.0214 | -0.00143 | 0.000625 | Los Angeles | CA | California | 6 | 10039107 |
| 1 | 2020 | 2 | 25 | 1 | 0.01000 | -0.00143 | .0743 | .00429 | 0.0229 | -0.00286 | 0.002020 | Los Angeles | CA | California | 6 | 10039107 |
| 2 | 2020 | 2 | 26 | 1 | 0.01570 | 0.00286 | .0714 | .00571 | 0.0243 | -0.00429 | 0.003620 | Los Angeles | CA | California | 6 | 10039107 |
| 3 | 2020 | 2 | 27 | 1 | 0.01860 | 0.00571 | .0757 | .00714 | 0.0243 | -0.00429 | 0.003620 | Los Angeles | CA | California | 6 | 10039107 |
| 4 | 2020 | 2 | 28 | 1 | 0.02290 | 0.01140 | .08 | .01 | 0.0257 | -0.00571 | 0.005000 | Los Angeles | CA | California | 6 | 10039107 |
def combine_date_parts(x):
    """Return a zero-padded 'YYYY-MM-DD' string from a row's year/month/day fields.

    Replaces a lambda bound to a name (PEP 8 E731) with a proper def; behavior
    is unchanged: e.g. year=2020, month=3, day=7 -> "2020-03-07".
    """
    return "-".join([str(x.year), str(x.month).zfill(2), str(x.day).zfill(2)])
#assemble a Date column from the year/month/day parts and parse it
city_mobility['Date'] = city_mobility.apply(combine_date_parts, axis=1)
city_mobility['Date'] = pd.to_datetime(city_mobility['Date'])
#join Philadelphia mobility rows to the estimated true daily visits for the
#first Wawa store in the October extract, aligning on date
df = city_mobility[city_mobility.cityname == 'Philadelphia'].merge(sg.calc_true_visits_by_day(get_sg_data("wawa", "oct").iloc[0]).reset_index(), how='right', left_on='Date', right_on='index')
#baseline: ~1,584,000 presumably Philadelphia's population, scaled by 0.25 --
#TODO confirm where this 25% factor comes from
pop = 1584000 * 0.25
#convert the away-from-home fractional change into an absolute people estimate
df['mobility'] = pop + (pop * df['gps_away_from_home'])
#correlate estimated store visits with the citywide mobility estimate
df[['Date', 'true_visits', 'mobility']].set_index('Date').corr()
| true_visits | mobility | |
|---|---|---|
| true_visits | 1.000000 | 0.479887 |
| mobility | 0.479887 | 1.000000 |
#re-parse Date (idempotent if already datetime) and rebuild the joined frame
city_mobility['Date'] = pd.to_datetime(city_mobility['Date'])
df = city_mobility[city_mobility.cityname == 'Philadelphia'].merge(sg.calc_true_visits_by_day(get_sg_data("wawa", "oct").iloc[0]).reset_index(), how='right', left_on='Date', right_on='index')
#correlate estimated true visits against each Google mobility category
df[['true_visits', 'gps_retail_and_recreation', 'gps_grocery_and_pharmacy', 'gps_parks', 'gps_transit_stations', 'gps_workplaces', 'gps_residential', 'gps_away_from_home']].corr()
| true_visits | gps_retail_and_recreation | gps_grocery_and_pharmacy | gps_workplaces | gps_residential | gps_away_from_home | |
|---|---|---|---|---|---|---|
| true_visits | 1.000000 | 0.587968 | 0.608726 | 0.412742 | -0.486473 | 0.479887 |
| gps_retail_and_recreation | 0.587968 | 1.000000 | 0.972060 | 0.204902 | -0.958927 | 0.967373 |
| gps_grocery_and_pharmacy | 0.608726 | 0.972060 | 1.000000 | 0.313941 | -0.932271 | 0.932119 |
| gps_workplaces | 0.412742 | 0.204902 | 0.313941 | 1.000000 | -0.149597 | 0.170817 |
| gps_residential | -0.486473 | -0.958927 | -0.932271 | -0.149597 | 1.000000 | -0.988547 |
| gps_away_from_home | 0.479887 | 0.967373 | 0.932119 | 0.170817 | -0.988547 | 1.000000 |